{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "toc": true
   },
   "source": [
    "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
    "<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Voting\" data-toc-modified-id=\"Voting-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Voting</a></span></li><li><span><a href=\"#Bagging\" data-toc-modified-id=\"Bagging-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Bagging</a></span><ul class=\"toc-item\"><li><span><a href=\"#RandomForest\" data-toc-modified-id=\"RandomForest-2.1\"><span class=\"toc-item-num\">2.1&nbsp;&nbsp;</span>RandomForest</a></span></li></ul></li><li><span><a href=\"#Stacking\" data-toc-modified-id=\"Stacking-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Stacking</a></span></li><li><span><a href=\"#Blending\" data-toc-modified-id=\"Blending-4\"><span class=\"toc-item-num\">4&nbsp;&nbsp;</span>Blending</a></span></li><li><span><a href=\"#Boosting\" data-toc-modified-id=\"Boosting-5\"><span class=\"toc-item-num\">5&nbsp;&nbsp;</span>Boosting</a></span></li></ul></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型融合实践"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**载入数据**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T09:07:59.900891Z",
     "start_time": "2019-09-21T09:07:59.363136Z"
    }
   },
   "outputs": [],
   "source": [
    "# 匹马印第安人糖尿病的数据集\n",
    "# 【1】Pregnancies：怀孕次数\n",
    "# 【2】Glucose：葡萄糖\n",
    "# 【3】BloodPressure：血压 (mm Hg)\n",
    "# 【4】SkinThickness：皮层厚度 (mm)\n",
    "# 【5】Insulin：胰岛素 2小时血清胰岛素（mu U / ml\n",
    "# 【6】BMI：体重指数 （体重/身高）^2\n",
    "# 【7】DiabetesPedigreeFunction：糖尿病谱系功能\n",
    "# 【8】Age：年龄 （岁）\n",
    "# 【9】Outcome：类标变量 （0或1）\n",
    "\n",
    "import pandas as pd\n",
    "names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']\n",
    "df = pd.read_csv('../data/pima/pima-indians-diabetes.csv', names=names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T09:08:03.576906Z",
     "start_time": "2019-09-21T09:08:03.562944Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>preg</th>\n",
       "      <th>plas</th>\n",
       "      <th>pres</th>\n",
       "      <th>skin</th>\n",
       "      <th>test</th>\n",
       "      <th>mass</th>\n",
       "      <th>pedi</th>\n",
       "      <th>age</th>\n",
       "      <th>class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6</td>\n",
       "      <td>148</td>\n",
       "      <td>72</td>\n",
       "      <td>35</td>\n",
       "      <td>0</td>\n",
       "      <td>33.6</td>\n",
       "      <td>0.627</td>\n",
       "      <td>50</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>85</td>\n",
       "      <td>66</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "      <td>26.6</td>\n",
       "      <td>0.351</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>183</td>\n",
       "      <td>64</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>23.3</td>\n",
       "      <td>0.672</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>89</td>\n",
       "      <td>66</td>\n",
       "      <td>23</td>\n",
       "      <td>94</td>\n",
       "      <td>28.1</td>\n",
       "      <td>0.167</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>137</td>\n",
       "      <td>40</td>\n",
       "      <td>35</td>\n",
       "      <td>168</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2.288</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   preg  plas  pres  skin  test  mass   pedi  age  class\n",
       "0     6   148    72    35     0  33.6  0.627   50      1\n",
       "1     1    85    66    29     0  26.6  0.351   31      0\n",
       "2     8   183    64     0     0  23.3  0.672   32      1\n",
       "3     1    89    66    23    94  28.1  0.167   21      0\n",
       "4     0   137    40    35   168  43.1  2.288   33      1"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T09:08:28.036289Z",
     "start_time": "2019-09-21T09:08:28.030305Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 0], dtype=int64)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 是一个二分类问题\n",
    "df['class'].unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Voting\n",
    "\n",
    "投票器模型融合\n",
    "\n",
    "认为模型与模型之间有差异，而单个模型很难控制过拟合。\n",
    "所以采用使用“使用多种模型，结果多数表决”方法。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T09:43:47.043947Z",
     "start_time": "2019-09-21T09:43:47.040926Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn import model_selection\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.ensemble import VotingClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T09:43:52.643421Z",
     "start_time": "2019-09-21T09:43:52.640430Z"
    }
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T09:16:40.113539Z",
     "start_time": "2019-09-21T09:16:40.109515Z"
    }
   },
   "outputs": [],
   "source": [
    "X = df.iloc[:,0:8]\n",
    "Y = df['class']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T09:16:45.045057Z",
     "start_time": "2019-09-21T09:16:45.037079Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>preg</th>\n",
       "      <th>plas</th>\n",
       "      <th>pres</th>\n",
       "      <th>skin</th>\n",
       "      <th>test</th>\n",
       "      <th>mass</th>\n",
       "      <th>pedi</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6</td>\n",
       "      <td>148</td>\n",
       "      <td>72</td>\n",
       "      <td>35</td>\n",
       "      <td>0</td>\n",
       "      <td>33.6</td>\n",
       "      <td>0.627</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>85</td>\n",
       "      <td>66</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "      <td>26.6</td>\n",
       "      <td>0.351</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>183</td>\n",
       "      <td>64</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>23.3</td>\n",
       "      <td>0.672</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>89</td>\n",
       "      <td>66</td>\n",
       "      <td>23</td>\n",
       "      <td>94</td>\n",
       "      <td>28.1</td>\n",
       "      <td>0.167</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>137</td>\n",
       "      <td>40</td>\n",
       "      <td>35</td>\n",
       "      <td>168</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2.288</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   preg  plas  pres  skin  test  mass   pedi  age\n",
       "0     6   148    72    35     0  33.6  0.627   50\n",
       "1     1    85    66    29     0  26.6  0.351   31\n",
       "2     8   183    64     0     0  23.3  0.672   32\n",
       "3     1    89    66    23    94  28.1  0.167   21\n",
       "4     0   137    40    35   168  43.1  2.288   33"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T09:39:48.963009Z",
     "start_time": "2019-09-21T09:39:48.960017Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help on function cross_val_score in module sklearn.model_selection._validation:\n",
      "\n",
      "cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv='warn', n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score='raise-deprecating')\n",
      "    Evaluate a score by cross-validation\n",
      "    \n",
      "    Read more in the :ref:`User Guide <cross_validation>`.\n",
      "    \n",
      "    Parameters\n",
      "    ----------\n",
      "    estimator : estimator object implementing 'fit'\n",
      "        The object to use to fit the data.\n",
      "    \n",
      "    X : array-like\n",
      "        The data to fit. Can be for example a list, or an array.\n",
      "    \n",
      "    y : array-like, optional, default: None\n",
      "        The target variable to try to predict in the case of\n",
      "        supervised learning.\n",
      "    \n",
      "    groups : array-like, with shape (n_samples,), optional\n",
      "        Group labels for the samples used while splitting the dataset into\n",
      "        train/test set.\n",
      "    \n",
      "    scoring : string, callable or None, optional, default: None\n",
      "        A string (see model evaluation documentation) or\n",
      "        a scorer callable object / function with signature\n",
      "        ``scorer(estimator, X, y)``.\n",
      "    \n",
      "    cv : int, cross-validation generator or an iterable, optional\n",
      "        Determines the cross-validation splitting strategy.\n",
      "        Possible inputs for cv are:\n",
      "    \n",
      "        - None, to use the default 3-fold cross validation,\n",
      "        - integer, to specify the number of folds in a `(Stratified)KFold`,\n",
      "        - :term:`CV splitter`,\n",
      "        - An iterable yielding (train, test) splits as arrays of indices.\n",
      "    \n",
      "        For integer/None inputs, if the estimator is a classifier and ``y`` is\n",
      "        either binary or multiclass, :class:`StratifiedKFold` is used. In all\n",
      "        other cases, :class:`KFold` is used.\n",
      "    \n",
      "        Refer :ref:`User Guide <cross_validation>` for the various\n",
      "        cross-validation strategies that can be used here.\n",
      "    \n",
      "        .. versionchanged:: 0.20\n",
      "            ``cv`` default value if None will change from 3-fold to 5-fold\n",
      "            in v0.22.\n",
      "    \n",
      "    n_jobs : int or None, optional (default=None)\n",
      "        The number of CPUs to use to do the computation.\n",
      "        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n",
      "        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`\n",
      "        for more details.\n",
      "    \n",
      "    verbose : integer, optional\n",
      "        The verbosity level.\n",
      "    \n",
      "    fit_params : dict, optional\n",
      "        Parameters to pass to the fit method of the estimator.\n",
      "    \n",
      "    pre_dispatch : int, or string, optional\n",
      "        Controls the number of jobs that get dispatched during parallel\n",
      "        execution. Reducing this number can be useful to avoid an\n",
      "        explosion of memory consumption when more jobs get dispatched\n",
      "        than CPUs can process. This parameter can be:\n",
      "    \n",
      "            - None, in which case all the jobs are immediately\n",
      "              created and spawned. Use this for lightweight and\n",
      "              fast-running jobs, to avoid delays due to on-demand\n",
      "              spawning of the jobs\n",
      "    \n",
      "            - An int, giving the exact number of total jobs that are\n",
      "              spawned\n",
      "    \n",
      "            - A string, giving an expression as a function of n_jobs,\n",
      "              as in '2*n_jobs'\n",
      "    \n",
      "    error_score : 'raise' | 'raise-deprecating' or numeric\n",
      "        Value to assign to the score if an error occurs in estimator fitting.\n",
      "        If set to 'raise', the error is raised.\n",
      "        If set to 'raise-deprecating', a FutureWarning is printed before the\n",
      "        error is raised.\n",
      "        If a numeric value is given, FitFailedWarning is raised. This parameter\n",
      "        does not affect the refit step, which will always raise the error.\n",
      "        Default is 'raise-deprecating' but from version 0.22 it will change\n",
      "        to np.nan.\n",
      "    \n",
      "    Returns\n",
      "    -------\n",
      "    scores : array of float, shape=(len(list(cv)),)\n",
      "        Array of scores of the estimator for each run of the cross validation.\n",
      "    \n",
      "    Examples\n",
      "    --------\n",
      "    >>> from sklearn import datasets, linear_model\n",
      "    >>> from sklearn.model_selection import cross_val_score\n",
      "    >>> diabetes = datasets.load_diabetes()\n",
      "    >>> X = diabetes.data[:150]\n",
      "    >>> y = diabetes.target[:150]\n",
      "    >>> lasso = linear_model.Lasso()\n",
      "    >>> print(cross_val_score(lasso, X, y, cv=3))  # doctest: +ELLIPSIS\n",
      "    [0.33150734 0.08022311 0.03531764]\n",
      "    \n",
      "    See Also\n",
      "    ---------\n",
      "    :func:`sklearn.model_selection.cross_validate`:\n",
      "        To run cross-validation on multiple metrics and also to return\n",
      "        train scores, fit times and score times.\n",
      "    \n",
      "    :func:`sklearn.model_selection.cross_val_predict`:\n",
      "        Get predictions from each split of cross-validation for diagnostic\n",
      "        purposes.\n",
      "    \n",
      "    :func:`sklearn.metrics.make_scorer`:\n",
      "        Make a scorer from a performance metric or loss function.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "help(model_selection.cross_val_score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T09:47:48.792857Z",
     "start_time": "2019-09-21T09:47:48.625305Z"
    }
   },
   "outputs": [],
   "source": [
    "# 设置 k 折交叉验证\n",
    "kfold = model_selection.KFold(n_splits=5, random_state=10)\n",
    "\n",
    "# 创建投票器的子模型\n",
    "estimators = []\n",
    "model_1 = LogisticRegression()\n",
    "estimators.append(('logistic', model_1))\n",
    "\n",
    "model_2 = DecisionTreeClassifier()\n",
    "estimators.append(('dt', model_2))\n",
    "\n",
    "model_3 = SVC()\n",
    "estimators.append(('svm', model_3))\n",
    "\n",
    "# 构建投票器融合\n",
    "ensemble = VotingClassifier(estimators)\n",
    "\n",
    "# 交叉验证结果\n",
    "result = model_selection.cross_val_score(ensemble, X, Y ,scoring='accuracy',cv=kfold)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T09:47:49.252630Z",
     "start_time": "2019-09-21T09:47:49.248641Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.72077922, 0.66233766, 0.73376623, 0.82352941, 0.75163399])"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T09:44:05.940359Z",
     "start_time": "2019-09-21T09:44:05.936370Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7435956200662084\n"
     ]
    }
   ],
   "source": [
    "print(result.mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Bagging\n",
    "\n",
    "1、模型效果不好的原因？\n",
    "欠拟合/过拟合\n",
    "\n",
    "2、如何缓解过拟合？\n",
    "少给点题，别让它直接把所有题目答案背下来\n",
    "多找几个同学做题，综合一下他们的答案\n",
    "\n",
    "1. 从原始样本集中抽取训练集。每轮从原始样本集中使用Bootstraping的方法抽取n个训练样本（在训练集中，有些样本可能被多次抽取到，而有些样本可能一次都没有被抽中）。共进行k轮抽取，得到k个训练集。（k个训练集之间是相互独立的）\n",
    "\n",
    "2. 每次使用一个训练集得到一个模型，k个训练集共得到k个模型。（注：这里并没有具体的分类算法或回归方法，我们可以根据具体问题采用不同的分类或回归方法，如决策树、感知器等）\n",
    "\n",
    "3. 对分类问题：将上步得到的k个模型采用投票的方式得到分类结果；对回归问题，计算上述模型的均值作为最后的结果。（所有模型的重要性相同）\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T09:52:55.989380Z",
     "start_time": "2019-09-21T09:52:55.986389Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import BaggingClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T09:56:06.501946Z",
     "start_time": "2019-09-21T09:56:05.793863Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7695866225277991\n"
     ]
    }
   ],
   "source": [
    "dt = DecisionTreeClassifier()\n",
    "num = 100\n",
    "# 设置 k 折交叉验证\n",
    "kfold = model_selection.KFold(n_splits=5, random_state=10)\n",
    "model = BaggingClassifier(base_estimator=dt, n_estimators=num, random_state=10)\n",
    "result = model_selection.cross_val_score(model, X, Y, cv=kfold)\n",
    "print(result.mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  RandomForest\n",
    "\n",
    "随机森林也是 bagging，对特征做了采样"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T09:58:48.537387Z",
     "start_time": "2019-09-21T09:58:47.987406Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7526610644257703\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "num_trees = 100\n",
    "max_feature_num = 5\n",
    "kfold = model_selection.KFold(n_splits=5, random_state=2018)\n",
    "model = RandomForestClassifier(n_estimators=num_trees, max_features=max_feature_num)\n",
    "result = model_selection.cross_val_score(model, X, Y, cv=kfold)\n",
    "print(result.mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T09:59:54.872786Z",
     "start_time": "2019-09-21T09:59:54.868767Z"
    }
   },
   "source": [
    "## Stacking\n",
    "\n",
    "用上层estimator结果作为下一层的特征。有点像是在公司里信息先传到各个部门经理那里，各个部门经理做出决策，再汇总到大boss。大boss根据不同的部门经理决策再做出最终的判断。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Blending\n",
    "\n",
    "弱化版的stacking，下一层不使用模型，相当于对结果做线性加权或者投票。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Boosting\n",
    "\n",
    "考得不好的原因是什么？\n",
    "\n",
    "1. 还不够努力，练习题需要多次学习---重复迭代和训练\n",
    "2. 时间分配要合理，要多练习之前做错的题---每次分配给分错的样本更高的权重\n",
    "3. 我不聪明，但是脚踏实地，用最简单的知识不断积累，成为专家---最简单的分类器叠加\n",
    "\n",
    "以 Adaboost 为例"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-21T10:07:13.589696Z",
     "start_time": "2019-09-21T10:07:13.467509Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7513623631270689\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "num_trees = 25\n",
    "kfold = model_selection.KFold(n_splits=5, random_state=10)\n",
    "model = AdaBoostClassifier(n_estimators=num_trees, random_state=10)\n",
    "result = model_selection.cross_val_score(model, X, Y, cv=kfold)\n",
    "print(result.mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
